# Loading all required libraries to perform Assignment.
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
# To build model for prediction
from sklearn.linear_model import LogisticRegression
# To tune different models
from sklearn.model_selection import GridSearchCV
# To get diferent metric scores
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
roc_auc_score,
plot_confusion_matrix,
precision_recall_curve,
roc_curve,
make_scorer,
)
# Loading the dataset
plc = pd.read_csv('Loan_Modelling.csv')
plc.head(5)  # quick look at the first five rows
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 25 | 1 | 49 | 91107 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 2 | 45 | 19 | 34 | 90089 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 3 | 39 | 15 | 11 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 4 | 35 | 9 | 100 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 5 | 35 | 8 | 45 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
plc.shape  # (rows, columns) of the loaded data
(5000, 14)
plc.info()  # column dtypes and per-column non-null counts
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5000 non-null int64 1 Age 5000 non-null int64 2 Experience 5000 non-null int64 3 Income 5000 non-null int64 4 ZIPCode 5000 non-null int64 5 Family 5000 non-null int64 6 CCAvg 5000 non-null float64 7 Education 5000 non-null int64 8 Mortgage 5000 non-null int64 9 Personal_Loan 5000 non-null int64 10 Securities_Account 5000 non-null int64 11 CD_Account 5000 non-null int64 12 Online 5000 non-null int64 13 CreditCard 5000 non-null int64 dtypes: float64(1), int64(13) memory usage: 547.0 KB
plc.describe().T  # summary statistics per column, transposed for readability
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ID | 5000.0 | 2500.500000 | 1443.520003 | 1.0 | 1250.75 | 2500.5 | 3750.25 | 5000.0 |
| Age | 5000.0 | 45.338400 | 11.463166 | 23.0 | 35.00 | 45.0 | 55.00 | 67.0 |
| Experience | 5000.0 | 20.104600 | 11.467954 | -3.0 | 10.00 | 20.0 | 30.00 | 43.0 |
| Income | 5000.0 | 73.774200 | 46.033729 | 8.0 | 39.00 | 64.0 | 98.00 | 224.0 |
| ZIPCode | 5000.0 | 93169.257000 | 1759.455086 | 90005.0 | 91911.00 | 93437.0 | 94608.00 | 96651.0 |
| Family | 5000.0 | 2.396400 | 1.147663 | 1.0 | 1.00 | 2.0 | 3.00 | 4.0 |
| CCAvg | 5000.0 | 1.937938 | 1.747659 | 0.0 | 0.70 | 1.5 | 2.50 | 10.0 |
| Education | 5000.0 | 1.881000 | 0.839869 | 1.0 | 1.00 | 2.0 | 3.00 | 3.0 |
| Mortgage | 5000.0 | 56.498800 | 101.713802 | 0.0 | 0.00 | 0.0 | 101.00 | 635.0 |
| Personal_Loan | 5000.0 | 0.096000 | 0.294621 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Securities_Account | 5000.0 | 0.104400 | 0.305809 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| CD_Account | 5000.0 | 0.060400 | 0.238250 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| Online | 5000.0 | 0.596800 | 0.490589 | 0.0 | 0.00 | 1.0 | 1.00 | 1.0 |
| CreditCard | 5000.0 | 0.294000 | 0.455637 | 0.0 | 0.00 | 0.0 | 1.00 | 1.0 |
#checking missing values (per-column count of nulls)
plc.isnull().sum()
ID 0 Age 0 Experience 0 Income 0 ZIPCode 0 Family 0 CCAvg 0 Education 0 Mortgage 0 Personal_Loan 0 Securities_Account 0 CD_Account 0 Online 0 CreditCard 0 dtype: int64
No missing values in the data set.
# Count and inspect the rows where Experience is negative (suspected typos).
print("Total rows having - sign in Experience column: " , plc[plc["Experience"] < 0]["Experience"].count())
plc[plc["Experience"] < 0]
Total rows having - sign in Experience column: 52
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 89 | 90 | 25 | -1 | 113 | 94303 | 4 | 2.30 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 226 | 227 | 24 | -1 | 39 | 94085 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 315 | 316 | 24 | -2 | 51 | 90630 | 3 | 0.30 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 451 | 452 | 28 | -2 | 48 | 94132 | 2 | 1.75 | 3 | 89 | 0 | 0 | 0 | 1 | 0 |
| 524 | 525 | 24 | -1 | 75 | 93014 | 4 | 0.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 536 | 537 | 25 | -1 | 43 | 92173 | 3 | 2.40 | 2 | 176 | 0 | 0 | 0 | 1 | 0 |
| 540 | 541 | 25 | -1 | 109 | 94010 | 4 | 2.30 | 3 | 314 | 0 | 0 | 0 | 1 | 0 |
| 576 | 577 | 25 | -1 | 48 | 92870 | 3 | 0.30 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 583 | 584 | 24 | -1 | 38 | 95045 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 597 | 598 | 24 | -2 | 125 | 92835 | 2 | 7.20 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 649 | 650 | 25 | -1 | 82 | 92677 | 4 | 2.10 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 670 | 671 | 23 | -1 | 61 | 92374 | 4 | 2.60 | 1 | 239 | 0 | 0 | 0 | 1 | 0 |
| 686 | 687 | 24 | -1 | 38 | 92612 | 4 | 0.60 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 793 | 794 | 24 | -2 | 150 | 94720 | 2 | 2.00 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 889 | 890 | 24 | -2 | 82 | 91103 | 2 | 1.60 | 3 | 0 | 0 | 0 | 0 | 1 | 1 |
| 909 | 910 | 23 | -1 | 149 | 91709 | 1 | 6.33 | 1 | 305 | 0 | 0 | 0 | 0 | 1 |
| 1173 | 1174 | 24 | -1 | 35 | 94305 | 2 | 1.70 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1428 | 1429 | 25 | -1 | 21 | 94583 | 4 | 0.40 | 1 | 90 | 0 | 0 | 0 | 1 | 0 |
| 1522 | 1523 | 25 | -1 | 101 | 94720 | 4 | 2.30 | 3 | 256 | 0 | 0 | 0 | 0 | 1 |
| 1905 | 1906 | 25 | -1 | 112 | 92507 | 2 | 2.00 | 1 | 241 | 0 | 0 | 0 | 1 | 0 |
| 2102 | 2103 | 25 | -1 | 81 | 92647 | 2 | 1.60 | 3 | 0 | 0 | 0 | 0 | 1 | 1 |
| 2430 | 2431 | 23 | -1 | 73 | 92120 | 4 | 2.60 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2466 | 2467 | 24 | -2 | 80 | 94105 | 2 | 1.60 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2545 | 2546 | 25 | -1 | 39 | 94720 | 3 | 2.40 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2618 | 2619 | 23 | -3 | 55 | 92704 | 3 | 2.40 | 2 | 145 | 0 | 0 | 0 | 1 | 0 |
| 2717 | 2718 | 23 | -2 | 45 | 95422 | 4 | 0.60 | 2 | 0 | 0 | 0 | 0 | 1 | 1 |
| 2848 | 2849 | 24 | -1 | 78 | 94720 | 2 | 1.80 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2876 | 2877 | 24 | -2 | 80 | 91107 | 2 | 1.60 | 3 | 238 | 0 | 0 | 0 | 0 | 0 |
| 2962 | 2963 | 23 | -2 | 81 | 91711 | 2 | 1.80 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2980 | 2981 | 25 | -1 | 53 | 94305 | 3 | 2.40 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3076 | 3077 | 29 | -1 | 62 | 92672 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3130 | 3131 | 23 | -2 | 82 | 92152 | 2 | 1.80 | 2 | 0 | 0 | 1 | 0 | 0 | 1 |
| 3157 | 3158 | 23 | -1 | 13 | 94720 | 4 | 1.00 | 1 | 84 | 0 | 0 | 0 | 1 | 0 |
| 3279 | 3280 | 26 | -1 | 44 | 94901 | 1 | 2.00 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3284 | 3285 | 25 | -1 | 101 | 95819 | 4 | 2.10 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3292 | 3293 | 25 | -1 | 13 | 95616 | 4 | 0.40 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3394 | 3395 | 25 | -1 | 113 | 90089 | 4 | 2.10 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3425 | 3426 | 23 | -1 | 12 | 91605 | 4 | 1.00 | 1 | 90 | 0 | 0 | 0 | 1 | 0 |
| 3626 | 3627 | 24 | -3 | 28 | 90089 | 4 | 1.00 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3796 | 3797 | 24 | -2 | 50 | 94920 | 3 | 2.40 | 2 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3824 | 3825 | 23 | -1 | 12 | 95064 | 4 | 1.00 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 3887 | 3888 | 24 | -2 | 118 | 92634 | 2 | 7.20 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 3946 | 3947 | 25 | -1 | 40 | 93117 | 3 | 2.40 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4015 | 4016 | 25 | -1 | 139 | 93106 | 2 | 2.00 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4088 | 4089 | 29 | -1 | 71 | 94801 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4116 | 4117 | 24 | -2 | 135 | 90065 | 2 | 7.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4285 | 4286 | 23 | -3 | 149 | 93555 | 2 | 7.20 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4411 | 4412 | 23 | -2 | 75 | 90291 | 2 | 1.80 | 2 | 0 | 0 | 0 | 0 | 1 | 1 |
| 4481 | 4482 | 25 | -2 | 35 | 95045 | 4 | 1.00 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4514 | 4515 | 24 | -3 | 41 | 91768 | 4 | 1.00 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4582 | 4583 | 25 | -1 | 69 | 92691 | 3 | 0.30 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4957 | 4958 | 29 | -1 | 50 | 95842 | 2 | 1.75 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
There are 52 rows in the Experience column with a '-' sign; I will treat them at a later stage.
# Getting unique values on different columns
plc_columns = [
    "Securities_Account", "Personal_Loan", "CD_Account", "CreditCard",
    "Education", "Family", "Online", "Mortgage",
]
# Print the level counts of each categorical column, separated by a rule.
for col in plc_columns:
    print(plc[col].value_counts())
    print("*" * 50)
0 4478
1 522
Name: Securities_Account, dtype: int64
**************************************************
0 4520
1 480
Name: Personal_Loan, dtype: int64
**************************************************
0 4698
1 302
Name: CD_Account, dtype: int64
**************************************************
0 3530
1 1470
Name: CreditCard, dtype: int64
**************************************************
1 2096
3 1501
2 1403
Name: Education, dtype: int64
**************************************************
1 1472
2 1296
4 1222
3 1010
Name: Family, dtype: int64
**************************************************
1 2984
0 2016
Name: Online, dtype: int64
**************************************************
0 3462
98 17
119 16
89 16
91 16
...
547 1
458 1
505 1
361 1
541 1
Name: Mortgage, Length: 347, dtype: int64
**************************************************
# checking for unique values in ID column
plc["ID"].nunique()  # 5000 unique IDs -> one per row
5000
Since ID is unique for every row, it carries no predictive information, so it is okay to drop the column.
plc.drop(["ID"], axis=1, inplace=True)  # ID is a pure row identifier, no predictive value
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram of one numeric column, stacked on a shared x-axis.

    data: dataframe
    feature: dataframe column name
    figsize: size of figure (default (12, 7))
    kde: whether to overlay the density curve on the histogram (default False)
    bins: number of bins for the histogram (default None, i.e. let seaborn pick)
    """
    # Two stacked axes: a slim boxplot on top, the histogram underneath.
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # Number of rows of the subplot grid = 2
        sharex=True,  # x-axis shared so the two views line up
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # Boxplot; showmeans adds a star marker at the column mean.
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )
    # Histogram. The original abused a conditional expression as a statement;
    # a plain if/else is clearer and behaves identically.
    if bins:
        sns.histplot(
            data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins, palette="winter"
        )
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
    # Reference lines: dashed green = mean, solid black = median.
    ax_hist2.axvline(data[feature].mean(), color="green", linestyle="--")
    ax_hist2.axvline(data[feature].median(), color="black", linestyle="-")
# Univariate distribution (boxplot + histogram) of each numeric column.
histogram_boxplot(plc, "Age")
histogram_boxplot(plc, "Income")
histogram_boxplot(plc, "Mortgage")
histogram_boxplot(plc, "Experience")
histogram_boxplot(plc, "CCAvg")
#function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot of a categorical column with the count or percentage above each bar.

    data: dataframe
    feature: dataframe column name
    perc: whether to display percentages instead of counts (default False)
    n: display only the top n category levels (default None, i.e. all levels)
    """
    total = len(data[feature])  # denominator for the percentage labels
    count = data[feature].nunique()
    # Scale the figure width with the number of bars actually drawn.
    if n is None:
        plt.figure(figsize=(count + 2, 6))
    else:
        plt.figure(figsize=(n + 2, 6))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        # keep the n most frequent levels, then sort them for display order
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    # Annotate each bar with its count or its percentage of the total.
    for p in ax.patches:
        if perc:  # idiom fix: was `if perc == True`
            label = "{:.1f}%".format(100 * p.get_height() / total)
        else:
            label = p.get_height()
        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),  # nudge the label slightly above the bar
            textcoords="offset points",
        )
    plt.show()  # show the plot
# Percentage breakdown of the binary/ordinal columns.
labeled_barplot(plc, "Personal_Loan", perc=True)
labeled_barplot(plc, "Securities_Account", perc=True)
labeled_barplot(plc, "CD_Account", perc=True)
labeled_barplot(plc, "CreditCard", perc=True)
labeled_barplot(plc, "Education", perc=True)
#### 1. About 42% of the customers are undergraduates, while the remaining ~58% hold a graduate or an advanced/professional degree.
labeled_barplot(plc, "Family", perc=True)  # family-size distribution
sns.pairplot(plc,diag_kind='kde')  # pairwise scatter plots; KDE on the diagonal
<seaborn.axisgrid.PairGrid at 0x7fc049eaedf0>
### function to plot distributions wrt target
def distribution_plot_wrt_target(data, predictor, target):
    """Four-panel view of `predictor` split by the two classes of `target`:
    density histograms per class on top, boxplots (with/without outliers) below."""
    fig, axs = plt.subplots(2, 2, figsize=(12, 10))
    target_uniq = data[target].unique()
    # Top row: one density histogram per target class.
    for col, (value, shade) in enumerate(zip(target_uniq[:2], ("teal", "orange"))):
        axs[0, col].set_title("Distribution of target for target=" + str(value))
        sns.histplot(
            data=data[data[target] == value],
            x=predictor,
            kde=True,
            ax=axs[0, col],
            color=shade,
            stat="density",
        )
    # Bottom row: predictor vs target boxplots, with and without outliers.
    axs[1, 0].set_title("Boxplot w.r.t target")
    sns.boxplot(data=data, x=target, y=predictor, ax=axs[1, 0], palette="gist_rainbow")
    axs[1, 1].set_title("Boxplot (without outliers) w.r.t target")
    sns.boxplot(
        data=data,
        x=target,
        y=predictor,
        ax=axs[1, 1],
        showfliers=False,
        palette="gist_rainbow",
    )
    plt.tight_layout()
    plt.show()
# Each numeric predictor's distribution, split by loan takers vs non-takers.
distribution_plot_wrt_target(plc, "Age", "Personal_Loan")
distribution_plot_wrt_target(plc, "CCAvg", "Personal_Loan")
distribution_plot_wrt_target(plc, "Income", "Personal_Loan")
distribution_plot_wrt_target(plc, "Experience", "Personal_Loan")
distribution_plot_wrt_target(plc, "Mortgage", "Personal_Loan")
I noticed that Income, Mortgage, CCAvg, Age, Family, and Experience take a wide variety of values. From the univariate and bivariate analysis, I noticed that except for Experience, the other variables have a greater impact on whether a customer decides to take a personal loan. The key question for this assignment is 'which variables are most significant'. In order to find that out, I want to create bins and use them as categories. To make my analysis more realistic, I want to categorise these variables into 2-3 groups each.
#checking min income of the customers (output: 8, so there are no zero incomes)
plc.Income.min()
8
Noted none of them are having 0 income. The minimum income is around 8K
#Getting count of different income group people.
print(len(plc[plc["Income"] == 0]))  # sanity check: no rows with zero income
income0to49 = len(plc[plc["Income"].between(1, 49)])
income50to99 = len(plc[plc["Income"].between(50, 99)])
Incomemorethan100 = len(plc[plc["Income"] >= 100])
print("Count of Income between 1-49K :" , income0to49)
print("Count of Income between 50 to 99K :" , income50to99 )
print("Count of Income greater than 100K:" , Incomemorethan100)
# the three buckets should cover all 5000 rows
print ("Total count: " , (income0to49+ income50to99 + Incomemorethan100))
0 Count of Income between 1-49K : 1869 Count of Income between 50 to 99K : 1909 Count of Income greater than 100K: 1222 Total count: 5000
# Putting each customer into 3 income groups: 1 = 0-49K, 2 = 50-99K, 3 = 100K+.
# The conditions are evaluated on the original values and are mutually
# exclusive, so a single nested np.where is equivalent to the original
# sequence of three separate assignments.
income = plc['Income']
plc['Income'] = np.where(
    income.between(0, 49), 1,
    np.where(income.between(50, 99), 2, np.where(income > 99, 3, income)),
)
plc.head(10)
| Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 1 | 1 | 91107 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 45 | 19 | 1 | 90089 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 39 | 15 | 1 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 35 | 9 | 3 | 94112 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 35 | 8 | 1 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
| 5 | 37 | 13 | 1 | 92121 | 4 | 0.4 | 2 | 155 | 0 | 0 | 0 | 1 | 0 |
| 6 | 53 | 27 | 2 | 91711 | 2 | 1.5 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 7 | 50 | 24 | 1 | 93943 | 1 | 0.3 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 8 | 35 | 10 | 2 | 90089 | 3 | 0.6 | 2 | 104 | 0 | 0 | 0 | 1 | 0 |
| 9 | 34 | 9 | 3 | 93023 | 1 | 8.9 | 3 | 0 | 1 | 0 | 0 | 0 | 0 |
#checking after creating groups (bucket counts should sum to all 5000 rows)
plc.groupby(['Income'])['Income'].count().sum()
5000
# Count customers with and without a mortgage.
MortgageNo = len(plc[plc["Mortgage"] == 0])
MortgageYes = len(plc[plc["Mortgage"] > 0])
print("Count of customer who has no mortgage :" , MortgageNo)
# bug fix: this label was a copy-paste of the line above and also said
# "no mortgage" even though it reports the has-mortgage count
print("Count of customer who has a mortgage :" , MortgageYes)
print ("Total count: " , (MortgageNo+ MortgageYes))
Count of customer who has no mortgage : 3462 Count of customer who has no mortgage : : 1538 Total count: 5000
# Splitting customers into two buckets only: 1 = has a mortgage, 0 = none.
# Mortgage holds non-negative amounts here, so this vectorized boolean cast
# is equivalent to the original per-element apply(lambda x: 1 if x > 0 else x)
# and avoids the slow Python-level loop.
plc.Mortgage = (plc.Mortgage > 0).astype(int)
# Checking count again to confirm it is matching with 5000 total
plc.groupby(['Mortgage'])['Mortgage'].count()
Mortgage 0 3462 1 1538 Name: Mortgage, dtype: int64
#Getting counts of customers in three CCAvg (avg. monthly card spend) bands.
CCAvg0to1 = len(plc[(plc['CCAvg'] >=0) & (plc['CCAvg'] <=1)])
CCAvg1to3 = len(plc[(plc['CCAvg'] > 1) & (plc['CCAvg'] <=3)])
CCAvgthan3 = len(plc[plc["CCAvg"] > 3])
print("Count of CCAvg between 0-1 :" , CCAvg0to1)
print("Count of CCAvg between 1-3 :" , CCAvg1to3 )
print("Count of CCAvg between more than 3:" , CCAvgthan3)
# the three bands should cover all 5000 rows
print ("Total count: " , (CCAvg0to1+ CCAvg1to3 + CCAvgthan3))
Count of CCAvg between 0-1 : 1914 Count of CCAvg between 1-3 : 2237 Count of CCAvg between more than 3: 849 Total count: 5000
This tells me that the majority of customers' average credit-card spending (CCAvg) is at or below 3K. The bank should consider offering these customers incentives to convert that spending into personal loans — a huge opportunity for the bank to raise its revenue.
# Splitting customers into three buckets by CCAvg: 1 = 0-1K, 2 = 1-3K, 3 = >3K.
# Conditions are evaluated on the original values and are mutually exclusive,
# so one nested np.where matches the original three sequential assignments.
ccavg = plc['CCAvg']
plc['CCAvg'] = np.where(
    (ccavg >= 0) & (ccavg <= 1), 1,
    np.where((ccavg > 1) & (ccavg <= 3), 2, np.where(ccavg > 3, 3, ccavg)),
)
#Checking one more time the count
plc.groupby(['CCAvg'])['CCAvg'].count()
CCAvg 1.0 1914 2.0 2237 3.0 849 Name: CCAvg, dtype: int64
plc.Age.min()  # youngest customer is 23
23
#Getting counts of customers in three Age bands (comment fixed: said "CCAvg").
Age0to1 = len(plc[(plc['Age'] >=22) & (plc['Age'] <=37)])
Age1to3 = len(plc[(plc['Age'] > 37) & (plc['Age'] <=55)])
Agethan3 = len(plc[plc["Age"] > 55])
print("Count of Age between 22-37 :" , Age0to1)
print("Count of Age between 37-55 :" , Age1to3 )
print("Count of Age between more than 55:" , Agethan3)
# the three bands should cover all 5000 rows
print ("Total count: " , (Age0to1+ Age1to3 + Agethan3))
Count of Age between 22-37 : 1487 Count of Age between 37-55 : 2297 Count of Age between more than 55: 1216 Total count: 5000
# Placing customers into 3 age bands: 1 = 22-37, 2 = 38-55, 3 = 56+.
# Conditions are evaluated on the original values and are mutually exclusive,
# so one nested np.where matches the original three sequential assignments.
age = plc['Age']
plc['Age'] = np.where(
    (age >= 22) & (age <= 37), 1,
    np.where((age > 37) & (age <= 55), 2, np.where(age > 55, 3, age)),
)
plc.head(10)
| Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 91107 | 4 | 2.0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 2 | 19 | 1 | 90089 | 3 | 2.0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 2 | 15 | 1 | 94720 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1 | 9 | 3 | 94112 | 1 | 2.0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 1 | 8 | 1 | 91330 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
| 5 | 1 | 13 | 1 | 92121 | 4 | 1.0 | 2 | 1 | 0 | 0 | 0 | 1 | 0 |
| 6 | 2 | 27 | 2 | 91711 | 2 | 2.0 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 7 | 2 | 24 | 1 | 93943 | 1 | 1.0 | 3 | 0 | 0 | 0 | 0 | 0 | 1 |
| 8 | 1 | 10 | 2 | 90089 | 3 | 1.0 | 2 | 1 | 0 | 0 | 0 | 1 | 0 |
| 9 | 1 | 9 | 3 | 93023 | 1 | 3.0 | 3 | 0 | 1 | 0 | 0 | 0 | 0 |
#checking the count after placing them into 3 buckets (should sum to 5000).
plc.groupby(['Age'])['Age'].count()
Age 1 1487 2 2297 3 1216 Name: Age, dtype: int64
Noted that a few of the Experience rows have a '-' sign in front of the value. It could be a typo, since the column allowed signed values. Rather than removing all of those rows, I want to remove the sign and keep the rows.
# The negative Experience values are sign typos; fix them by taking the
# absolute value. This replaces the original round-trip through strings
# (astype(str) -> str.replace('-', '') -> astype(int64)), which was fragile
# and temporarily lost the integer dtype. Stripping the '-' character from
# the decimal representation is exactly abs() for these values.
plc['Experience'] = plc['Experience'].abs()
# Verify that no negative values remain.
print("Remaining negative Experience rows:", (plc['Experience'] < 0).sum())
plc.info()  # re-check dtypes after cleaning the Experience column
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 5000 non-null int64 1 Experience 5000 non-null int64 2 Income 5000 non-null int64 3 ZIPCode 5000 non-null int64 4 Family 5000 non-null int64 5 CCAvg 5000 non-null float64 6 Education 5000 non-null int64 7 Mortgage 5000 non-null int64 8 Personal_Loan 5000 non-null int64 9 Securities_Account 5000 non-null int64 10 CD_Account 5000 non-null int64 11 Online 5000 non-null int64 12 CreditCard 5000 non-null int64 dtypes: float64(1), int64(12) memory usage: 507.9 KB
#Looking into unique values in ZIPCode column (467 distinct codes)
plc['ZIPCode'].nunique()
467
There are about 467 unique zipcodes. Let me dig into more to get more insight into this column
# Resolve each distinct zip code to its US state to see whether ZIPCode
# carries any geographic signal worth keeping.
zipcodes = plc['ZIPCode'].unique().tolist()
print (len(zipcodes))
from uszipcode import SearchEngine
engine = SearchEngine()
for zipcode in zipcodes:
    x = engine.by_zipcode(zipcode)  # returns None when the code is not in the database
    if x is None:
        print("Unknown")
    else:
        print(x.state_abbr)  # every resolvable code printed CA (see output below)
467 CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA Unknown CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA Unknown CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA Unknown CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA Unknown CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA CA
Noted that all these zip codes belong to a single state, CA. Since there are many unique values and they all belong to the same state, the column adds little information; I am inclined to drop it.
# Correlation heatmap across all columns (annotated, fixed -1..1 color scale).
plt.figure(figsize=(15, 7))
sns.heatmap(plc.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
There is no significant correlation between the columns. However income,CCAvg, CD_Account have moderate influence on the Personal_loan.
# function to plot stacked bar chart
def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart.

    data: dataframe
    predictor: independent variable
    target: target variable
    Levels are sorted by the rarer target class (here Personal_Loan == 1),
    so the highest-response levels appear first.
    """
    count = data[predictor].nunique()
    # least-frequent target class, used as the sort key below
    sorter = data[target].value_counts().index[-1]
    # absolute counts, with row/column margins ("All")
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    # per-level proportions drive the stacked bars
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 6))
    # bug fix: the original called plt.legend() twice in a row; the first call
    # (loc="lower left", frameon=False) was dead code, fully overridden by the
    # second, so it has been removed.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
stacked_barplot(plc, "Income", "Personal_Loan")  # loan uptake by income bucket
Personal_Loan 0 1 All Income All 4520 480 5000 3 783 439 1222 2 1868 41 1909 1 1869 0 1869 ------------------------------------------------------------------------------------------------------------------------
It is better to target customers in income groups 2 and 3 (incomes of 50K and above).
stacked_barplot(plc, "Mortgage", "Personal_Loan")  # loan uptake by mortgage flag
Personal_Loan 0 1 All Mortgage All 4520 480 5000 0 3150 312 3462 1 1370 168 1538 ------------------------------------------------------------------------------------------------------------------------
The graph and the past campaign suggest targeting customers who have no mortgage.
stacked_barplot(plc, "Education", "Personal_Loan")  # loan uptake by education level
Personal_Loan 0 1 All Education All 4520 480 5000 3 1296 205 1501 2 1221 182 1403 1 2003 93 2096 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(plc, "CCAvg", "Personal_Loan")  # loan uptake by card-spend bucket
Personal_Loan 0 1 All CCAvg All 4520 480 5000 3.0 536 313 849 2.0 2119 118 2237 1.0 1865 49 1914 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(plc, "Age", "Personal_Loan")  # loan uptake by age band
Personal_Loan 0 1 All Age All 4520 480 5000 2 2089 208 2297 1 1329 158 1487 3 1102 114 1216 ------------------------------------------------------------------------------------------------------------------------
# Side by side: mean Age per Income bucket, and Experience split by loan status.
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.barplot(data=plc, y = "Age", x = "Income")
plt.xticks(rotation = 45)
plt.subplot(1,2,2)
sns.boxplot(data=plc, y = "Experience", x = "Personal_Loan")
plt.xticks(rotation = 45)
plt.show()
# NOTE(review): [0:-1] drops the LAST column, which at this point is
# CreditCard, not the outcome Personal_Loan — confirm whether the original
# "Excluding Outcome column" intent is actually met here.
columns = list(plc)[0:-1]
plc[columns].hist(stacked=False, bins=100, figsize=(12,30), layout=(14,2));
# Histogram of each column
I realized that some rows in the Experience column have a negative sign in front. I considered it a typo and removed it. I also noted that customers with higher incomes who spend more on credit cards tend to take personal loans regardless of their age and experience, and that larger families tend to take more loans. Since Income, Age, CCAvg, and Mortgage take a wide variety of values, I put them into different bins. I noticed ZIPCode has about 467 unique values, so I tend to drop this column and experiment with the rest of the columns.
plc.drop(["ZIPCode"], axis=1, inplace=True)  # 467 levels, all in CA -> little signal
# Boxplot of every numeric column to eyeball remaining outliers.
numerical_col = plc.select_dtypes(include=np.number).columns.tolist()
plt.figure(figsize=(20, 30))
for i, variable in enumerate(numerical_col):
    plt.subplot(5, 4, i + 1)  # grid of up to 20 subplots
    plt.boxplot(plc[variable], whis=1.5)  # whiskers at 1.5 * IQR
    plt.tight_layout()
    plt.title(variable)
plt.show()
# creating dummy variables for the bucketed/categorical columns;
# drop_first avoids perfect multicollinearity (the dummy-variable trap)
dummy_data = pd.get_dummies(
    plc,
    columns=[
        "Income",
        "Age",
        "CCAvg",
        "Mortgage",
        "Securities_Account",
        "CD_Account",
        "Online",
        "CreditCard",
        "Education",
        "Family"
    ],
    drop_first=True,
)
dummy_data.head()
| Experience | Personal_Loan | Income_2 | Income_3 | Age_2 | Age_3 | CCAvg_2.0 | CCAvg_3.0 | Mortgage_1 | Securities_Account_1 | CD_Account_1 | Online_1 | CreditCard_1 | Education_2 | Education_3 | Family_2 | Family_3 | Family_4 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 19 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 15 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 9 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
X = dummy_data.drop("Personal_Loan", axis=1)  # predictors
Y = dummy_data["Personal_Loan"]  # binary target
# creating dummy variables
#X = pd.get_dummies(X, drop_first=True)
# splitting in training and test set
# NOTE(review): the target is imbalanced (~9.6% positives); consider passing
# stratify=Y so both splits keep the same class ratio — confirm before changing.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=50
)
print(X_train.shape, X_test.shape)
(3500, 17) (1500, 17)
# Report the shapes and the class balance of the two splits.
print("Shape of Training set : ", X_train.shape)
print("Shape of test set : ", X_test.shape)
print("Percentage of classes in training set:")
print(y_train.value_counts(normalize=True))
print("Percentage of classes in test set:")
print(y_test.value_counts(normalize=True))
Shape of Training set : (3500, 17) Shape of test set : (1500, 17) Percentage of classes in training set: 0 0.903429 1 0.096571 Name: Personal_Loan, dtype: float64 Percentage of classes in test set: 0 0.905333 1 0.094667 Name: Personal_Loan, dtype: float64
Model can make wrong predictions as: Predicting a bank can consider customer to issue loan when he/she may not qualify. Predicting a bank can consider customer not to issue a loan when he/she may qualify.
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn_with_threshold(model, predictors, target, threshold=0.5):
    """
    Compute accuracy, recall, precision and F1 for a fitted classifier at a
    custom probability threshold, returned as a one-row dataframe.

    model: fitted classifier exposing predict_proba
    predictors: independent variables
    target: true labels of the dependent variable
    threshold: probability cut-off for classifying an observation as class 1
    """
    # probability of the positive class
    pred_prob = model.predict_proba(predictors)[:, 1]
    # boolean predictions; sklearn metrics treat True/False as 1/0, so the
    # original's extra np.round() on an already-boolean array was redundant
    pred = pred_prob > threshold

    acc = accuracy_score(target, pred)  # to compute Accuracy
    recall = recall_score(target, pred)  # to compute Recall
    precision = precision_score(target, pred)  # to compute Precision
    f1 = f1_score(target, pred)  # to compute F1-score

    # one-row dataframe so results from several models can be concatenated
    df_perf = pd.DataFrame(
        {
            "Accuracy": acc,
            "Recall": recall,
            "Precision": precision,
            "F1": f1,
        },
        index=[0],
    )
    return df_perf
def confusion_matrix_sklearn_with_threshold(model, predictors, target, threshold=0.5):
    """
    Plot the confusion matrix, based on the threshold specified, with
    counts and percentages in each cell.

    model: fitted classifier exposing predict_proba
    predictors: independent variables
    target: true labels (dependent variable)
    threshold: probability cut-off for classifying an observation as class 1
    """
    # probability of the positive class (column 1 of predict_proba)
    pred_prob = model.predict_proba(predictors)[:, 1]
    # FIX: the comparison already yields 0/1 predictions; the original
    # np.round() call on the boolean array was redundant and is removed
    y_pred = pred_prob > threshold
    cm = confusion_matrix(target, y_pred)
    # build a "count\npercent" annotation for each of the 2x2 cells
    labels = np.asarray(
        [
            ["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / cm.flatten().sum())]
            for item in cm.flatten()
        ]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
# There are different solvers available in Sklearn logistic regression
# The newton-cg solver is faster for high-dimensional data
model = LogisticRegression(solver="newton-cg", random_state=1)
lg = model.fit(X_train, y_train)
# let us check the coefficients and intercept of the model
# lg.coef_[0] holds one log-odds coefficient per feature column of X_train
log_odds = lg.coef_[0]
# one-row (transposed) frame mapping each feature name to its coefficient
pd.DataFrame(log_odds, X_train.columns, columns=["coef"]).T
| Experience | Income_2 | Income_3 | Age_2 | Age_3 | CCAvg_2.0 | CCAvg_3.0 | Mortgage_1 | Securities_Account_1 | CD_Account_1 | Online_1 | CreditCard_1 | Education_2 | Education_3 | Family_2 | Family_3 | Family_4 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| coef | 0.009666 | 0.933949 | 5.348024 | 0.09585 | -0.048492 | 0.122684 | 1.990504 | 0.381583 | -0.942333 | 3.348377 | -0.524688 | -1.21184 | 2.8462 | 3.151144 | -0.2627 | 1.592229 | 1.130283 |
The coefficients of the logistic regression model are in terms of log(odd), to find the odds we have to take the exponential of the coefficients. Therefore, odds = exp(b) The percentage change in odds is given as odds = (exp(b) - 1) * 100
# converting coefficients to odds: odds ratio per feature = exp(log-odds)
odds = np.exp(lg.coef_[0])
# finding the percentage change in odds per unit increase in the feature
perc_change_odds = (np.exp(lg.coef_[0]) - 1) * 100
# removing limit from number of columns to display
pd.set_option("display.max_columns", None)
# adding the odds to a dataframe (transposed: one column per feature)
pd.DataFrame({"Odds": odds, "Change_odd%": perc_change_odds}, index=X_train.columns).T
| Experience | Income_2 | Income_3 | Age_2 | Age_3 | CCAvg_2.0 | CCAvg_3.0 | Mortgage_1 | Securities_Account_1 | CD_Account_1 | Online_1 | CreditCard_1 | Education_2 | Education_3 | Family_2 | Family_3 | Family_4 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Odds | 1.009713 | 2.544539 | 210.192605 | 1.100594 | 0.952665 | 1.130528 | 7.319222 | 1.464601 | 0.389717 | 28.456500 | 0.591740 | 0.297649 | 17.222213 | 23.362771 | 0.768973 | 4.914691 | 3.096533 |
| Change_odd% | 0.971324 | 154.453876 | 20919.260480 | 10.059390 | -4.733499 | 13.052751 | 631.922226 | 46.460138 | -61.028263 | 2745.650029 | -40.826017 | -70.235098 | 1622.221283 | 2236.277127 | -23.102731 | 391.469135 | 209.653312 |
# creating confusion matrix on the TRAINING set at the default 0.5 threshold
confusion_matrix_sklearn_with_threshold(lg, X_train, y_train)
# accuracy / recall / precision / F1 on the training set
log_reg_model_train_perf = model_performance_classification_sklearn_with_threshold(
lg, X_train, y_train
)
print("Training performance:")
log_reg_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.958 | 0.707101 | 0.832753 | 0.7648 |
# creating confusion matrix on the TEST set at the default 0.5 threshold
confusion_matrix_sklearn_with_threshold(lg, X_test, y_test)
# accuracy / recall / precision / F1 on the test set
log_reg_model_test_perf = model_performance_classification_sklearn_with_threshold(
lg, X_test, y_test
)
print("Test set performance:")
log_reg_model_test_perf
Test set performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.960667 | 0.683099 | 0.873874 | 0.766798 |
# ROC curve and AUC on the TRAINING set, using class-1 probabilities
logit_roc_auc_train = roc_auc_score(y_train, lg.predict_proba(X_train)[:, 1])
fpr, tpr, thresholds = roc_curve(y_train, lg.predict_proba(X_train)[:, 1])
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, label="Logistic Regression (area = %0.2f)" % logit_roc_auc_train)
# diagonal reference line: performance of a random classifier (AUC = 0.5)
plt.plot([0, 1], [0, 1], "r--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()
# ROC curve and AUC on the TEST set, using class-1 probabilities
logit_roc_auc_test = roc_auc_score(y_test, lg.predict_proba(X_test)[:, 1])
fpr, tpr, thresholds = roc_curve(y_test, lg.predict_proba(X_test)[:, 1])
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, label="Logistic Regression (area = %0.2f)" % logit_roc_auc_test)
# diagonal reference line: performance of a random classifier (AUC = 0.5)
plt.plot([0, 1], [0, 1], "r--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic")
plt.legend(loc="lower right")
plt.show()
# Optimal threshold as per AUC-ROC curve
# The optimal cut off would be where tpr is high and fpr is low.
# argmax(tpr - fpr) selects the threshold maximising Youden's J statistic.
fpr, tpr, thresholds = roc_curve(y_train, lg.predict_proba(X_train)[:, 1])
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold_auc_roc = thresholds[optimal_idx]
print(optimal_threshold_auc_roc)
0.09576830262671542
# creating confusion matrix on the training set at the ROC-optimal threshold
confusion_matrix_sklearn_with_threshold(
lg, X_train, y_train, threshold=optimal_threshold_auc_roc
)
# checking model performance for this model at the lowered threshold
log_reg_model_train_perf_threshold_auc_roc = model_performance_classification_sklearn_with_threshold(
lg, X_train, y_train, threshold=optimal_threshold_auc_roc
)
print("Training performance:")
log_reg_model_train_perf_threshold_auc_roc
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.910571 | 0.940828 | 0.520458 | 0.670179 |
# creating confusion matrix on the test set at the ROC-optimal threshold
confusion_matrix_sklearn_with_threshold(
lg, X_test, y_test, threshold=optimal_threshold_auc_roc
)
# checking model performance for this model at the lowered threshold
log_reg_model_test_perf_threshold_auc_roc = model_performance_classification_sklearn_with_threshold(
lg, X_test, y_test, threshold=optimal_threshold_auc_roc
)
print("Test set performance:")
log_reg_model_test_perf_threshold_auc_roc
Test set performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.913333 | 0.908451 | 0.52439 | 0.664948 |
The precision of the model for both training and test set has improved but the F1 score has reduced.
# class-1 probabilities on the training set, fed to the precision-recall curve
y_scores = lg.predict_proba(X_train)[:, 1]
prec, rec, tre = precision_recall_curve(y_train, y_scores,)
def plot_prec_recall_vs_tresh(precisions, recalls, thresholds):
    """
    Plot precision and recall as functions of the decision threshold.

    precisions, recalls: arrays returned by precision_recall_curve
    thresholds: corresponding thresholds; precision/recall arrays are one
        element longer than thresholds, hence the [:-1] slice
    """
    plt.plot(thresholds, precisions[:-1], "b--", label="precision")
    plt.plot(thresholds, recalls[:-1], "g--", label="recall")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
# draw the precision/recall vs threshold chart for the training set
plt.figure(figsize=(10, 7))
plot_prec_recall_vs_tresh(prec, rec, tre)
plt.show()
At the threshold of 0.38, we get balanced recall and precision.
# setting the threshold where precision and recall intersect on the curve
optimal_threshold_curve = 0.38
## creating confusion matrix on the training set at this threshold
confusion_matrix_sklearn_with_threshold(
lg, X_train, y_train, threshold=optimal_threshold_curve
)
# training metrics at the balanced precision/recall threshold
log_reg_model_train_perf_threshold_curve = model_performance_classification_sklearn_with_threshold(
lg, X_train, y_train, threshold=optimal_threshold_curve
)
print("Training performance:")
log_reg_model_train_perf_threshold_curve
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.961429 | 0.789941 | 0.806647 | 0.798206 |
# creating confusion matrix on the test set at the 0.38 threshold
confusion_matrix_sklearn_with_threshold(
lg, X_test, y_test, threshold=optimal_threshold_curve
)
# test metrics at the balanced precision/recall threshold
log_reg_model_test_perf_threshold_curve = model_performance_classification_sklearn_with_threshold(
lg, X_test, y_test, threshold=optimal_threshold_curve
)
print("Test set performance:")
log_reg_model_test_perf_threshold_curve
Test set performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.96 | 0.746479 | 0.815385 | 0.779412 |
# training performance comparison: one column per threshold variant
# (each per-model frame is 1 row x 4 metrics, so .T turns it into a column)
models_train_comp_df = pd.concat(
[
log_reg_model_train_perf.T,
log_reg_model_train_perf_threshold_auc_roc.T,
log_reg_model_train_perf_threshold_curve.T,
],
axis=1,
)
models_train_comp_df.columns = [
"Logistic Regression sklearn",
"Logistic Regression-0.09 Threshold",
"Logistic Regression-0.38 Threshold",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| Logistic Regression sklearn | Logistic Regression-0.09 Threshold | Logistic Regression-0.38 Threshold | |
|---|---|---|---|
| Accuracy | 0.958000 | 0.910571 | 0.961429 |
| Recall | 0.707101 | 0.940828 | 0.789941 |
| Precision | 0.832753 | 0.520458 | 0.806647 |
| F1 | 0.764800 | 0.670179 | 0.798206 |
# testing performance comparison: one column per threshold variant
# (each per-model frame is 1 row x 4 metrics, so .T turns it into a column)
models_test_comp_df = pd.concat(
[
log_reg_model_test_perf.T,
log_reg_model_test_perf_threshold_auc_roc.T,
log_reg_model_test_perf_threshold_curve.T
],
axis=1,
)
models_test_comp_df.columns = [
"Logistic Regression sklearn",
"Logistic Regression-0.09 Threshold",
"Logistic Regression-0.38 Threshold"
]
print("Test set performance comparison:")
models_test_comp_df
Test set performance comparison:
| Logistic Regression sklearn | Logistic Regression-0.09 Threshold | Logistic Regression-0.38 Threshold | |
|---|---|---|---|
| Accuracy | 0.960667 | 0.913333 | 0.960000 |
| Recall | 0.683099 | 0.908451 | 0.746479 |
| Precision | 0.873874 | 0.524390 | 0.815385 |
| F1 | 0.766798 | 0.664948 | 0.779412 |
# creating dummy variables (one-hot encoding) for the categorical /
# binned columns of the original dataframe; Experience and Personal_Loan
# are left as-is. NOTE(review): drop_first is not set, so one redundant
# dummy column per source column is kept.
dummy_data = pd.get_dummies(
plc,
columns=[
"Income",
"Age",
"CCAvg",
"Mortgage",
"Securities_Account",
"CD_Account",
"Online",
"CreditCard",
"Education",
"Family"
],
)
dummy_data.head()
| Experience | Personal_Loan | Income_1 | Income_2 | Income_3 | Age_1 | Age_2 | Age_3 | CCAvg_1.0 | CCAvg_2.0 | CCAvg_3.0 | Mortgage_0 | Mortgage_1 | Securities_Account_0 | Securities_Account_1 | CD_Account_0 | CD_Account_1 | Online_0 | Online_1 | CreditCard_0 | CreditCard_1 | Education_1 | Education_2 | Education_3 | Family_1 | Family_2 | Family_3 | Family_4 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 19 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 15 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 3 | 9 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 4 | 8 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
# Personal_Loan is the target; everything else becomes the feature matrix
X = dummy_data.drop("Personal_Loan", axis=1)
Y = dummy_data["Personal_Loan"]
# Printing the number of rows in the train and test data
print("Number of rows in train data =", X_train.shape[0])
print("Number of rows in test data =", X_test.shape[0])
Number of rows in train data = 3500 Number of rows in test data = 1500
# printing the % of Personal_Loan classes to confirm both splits keep a
# similar class balance (~90% non-takers / ~10% takers)
print("Percentage of Personal_loans in training set:")
print(y_train.value_counts(normalize=True))
print("Percentage of Personal_loans in test set:")
print(y_test.value_counts(normalize=True))
Percentage of Personal_loans in training set: 0 0.903429 1 0.096571 Name: Personal_Loan, dtype: float64 Percentage of Personal_loans in test set: 0 0.905333 1 0.094667 Name: Personal_Loan, dtype: float64
Creating common functions to calculate different metrics and confusion matrices.
## Function to calculate recall score
def get_recall_score(model, predictors, target):
    """
    Return the recall score of a fitted classifier on the given data.

    model: fitted classifier exposing predict
    predictors: independent variables
    target: true labels (dependent variable)
    """
    prediction = model.predict(predictors)
    return recall_score(target, prediction)
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the confusion matrix of a fitted classifier, annotating each cell
    with its count and percentage of all observations.

    model: fitted classifier exposing predict
    predictors: independent variables
    target: true labels (dependent variable)
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    # build a "count\npercent" annotation for each of the 2x2 cells
    labels = np.asarray(
        [
            ["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / cm.flatten().sum())]
            for item in cm.flatten()
        ]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
In this case, we can pass a dictionary {0:0.15,1:0.85} to the model to specify the weight of loan given or not and the decision tree will give more weightage to class 1.
class_weight is a hyperparameter for the decision tree classifier.
model = DecisionTreeClassifier(
criterion="gini", class_weight={0: 0.15, 1: 0.85}, random_state=1
)
model.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.15, 1: 0.85}, random_state=1)
#Checking model performance on training set
confusion_matrix_sklearn(model, X_train, y_train)
decision_tree_perf_train = get_recall_score(model, X_train, y_train)
print("Recall Score:", decision_tree_perf_train)
Recall Score: 1.0
# Checking model performance on the test set
confusion_matrix_sklearn(model, X_test, y_test)
decision_tree_perf_test = get_recall_score(model, X_test, y_test)
print("Recall Score:", decision_tree_perf_test)
## creating a list of column names for labelling the tree nodes
feature_names = X_train.columns.to_list()
plt.figure(figsize=(20, 30))
out = tree.plot_tree(
    model,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
# below code will add arrows to the decision tree split if they are missing
for o in out:
    arrow = o.arrow_patch
    if arrow is not None:
        arrow.set_edgecolor("black")
        arrow.set_linewidth(1)
# BUG FIX: plt.show was referenced without parentheses, so the function
# object was printed instead of the figure being rendered
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Text report showing the rules of a decision tree -
# show_weights=True prints the (class-weighted) sample weights at each leaf
print(tree.export_text(model, feature_names=feature_names, show_weights=True))
|--- Income_3 <= 0.50 | |--- CCAvg_3.0 <= 0.50 | | |--- CD_Account_1 <= 0.50 | | | |--- Family_4 <= 0.50 | | | | |--- weights: [259.20, 0.00] class: 0 | | | |--- Family_4 > 0.50 | | | | |--- Education_3 <= 0.50 | | | | | |--- Age_3 <= 0.50 | | | | | | |--- weights: [57.45, 0.00] class: 0 | | | | | |--- Age_3 > 0.50 | | | | | | |--- weights: [17.40, 0.00] class: 0 | | | | |--- Education_3 > 0.50 | | | | | |--- Mortgage_1 <= 0.50 | | | | | | |--- weights: [21.90, 0.00] class: 0 | | | | | |--- Mortgage_1 > 0.50 | | | | | | |--- Experience <= 17.50 | | | | | | | |--- Experience <= 16.50 | | | | | | | | |--- weights: [3.15, 0.00] class: 0 | | | | | | | |--- Experience > 16.50 | | | | | | | | |--- Online_1 <= 0.50 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- Online_1 > 0.50 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | |--- Experience > 17.50 | | | | | | | |--- weights: [5.25, 0.00] class: 0 | | |--- CD_Account_1 > 0.50 | | | |--- Securities_Account_1 <= 0.50 | | | | |--- Experience <= 26.50 | | | | | |--- Mortgage_1 <= 0.50 | | | | | | |--- weights: [1.95, 0.00] class: 0 | | | | | |--- Mortgage_1 > 0.50 | | | | | | |--- weights: [1.20, 0.00] class: 0 | | | | |--- Experience > 26.50 | | | | | |--- Mortgage_1 <= 0.50 | | | | | | |--- Income_2 <= 0.50 | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | |--- Income_2 > 0.50 | | | | | | | |--- Experience <= 38.00 | | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | | | |--- Experience > 38.00 | | | | | | | | |--- Family_4 <= 0.50 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- Family_4 > 0.50 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | |--- Mortgage_1 > 0.50 | | | | | | |--- weights: [1.05, 0.00] class: 0 | | | |--- Securities_Account_1 > 0.50 | | | | |--- weights: [8.55, 0.00] class: 0 | |--- CCAvg_3.0 > 0.50 | | |--- CD_Account_1 <= 0.50 | | | |--- Family_3 <= 0.50 | | | | |--- Experience <= 33.50 | | | | | 
|--- Experience <= 5.50 | | | | | | |--- Family_4 <= 0.50 | | | | | | | |--- Mortgage_1 <= 0.50 | | | | | | | | |--- Experience <= 4.00 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | |--- Experience > 4.00 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- Mortgage_1 > 0.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Family_4 > 0.50 | | | | | | | |--- weights: [0.90, 0.00] class: 0 | | | | | |--- Experience > 5.50 | | | | | | |--- Family_2 <= 0.50 | | | | | | | |--- Online_1 <= 0.50 | | | | | | | | |--- Experience <= 19.50 | | | | | | | | | |--- weights: [1.50, 0.00] class: 0 | | | | | | | | |--- Experience > 19.50 | | | | | | | | | |--- Experience <= 26.00 | | | | | | | | | | |--- Education_3 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | | |--- Education_3 > 0.50 | | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | |--- Experience > 26.00 | | | | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | | | |--- Online_1 > 0.50 | | | | | | | | |--- Experience <= 13.50 | | | | | | | | | |--- Age_2 <= 0.50 | | | | | | | | | | |--- weights: [1.05, 0.00] class: 0 | | | | | | | | | |--- Age_2 > 0.50 | | | | | | | | | | |--- Education_3 <= 0.50 | | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | | |--- Education_3 > 0.50 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | |--- Experience > 13.50 | | | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | | | |--- weights: [2.85, 0.00] class: 0 | | | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | | | |--- weights: [1.20, 0.00] class: 0 | | | | | | |--- Family_2 > 0.50 | | | | | | | |--- weights: [4.50, 0.00] class: 0 | | | | |--- Experience > 33.50 | | | | | |--- Online_1 <= 0.50 | | | | | | |--- Mortgage_1 <= 0.50 | | | | | | | |--- weights: [0.00, 2.55] class: 1 | | | | | | |--- Mortgage_1 > 0.50 | | | | | | | |--- weights: [0.15, 0.00] 
class: 0 | | | | | |--- Online_1 > 0.50 | | | | | | |--- weights: [0.90, 0.00] class: 0 | | | |--- Family_3 > 0.50 | | | | |--- Experience <= 31.00 | | | | | |--- Age_2 <= 0.50 | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | |--- Age_2 > 0.50 | | | | | | |--- weights: [0.00, 2.55] class: 1 | | | | |--- Experience > 31.00 | | | | | |--- weights: [0.45, 0.00] class: 0 | | |--- CD_Account_1 > 0.50 | | | |--- Education_3 <= 0.50 | | | | |--- Experience <= 27.00 | | | | | |--- weights: [0.00, 6.80] class: 1 | | | | |--- Experience > 27.00 | | | | | |--- Age_2 <= 0.50 | | | | | | |--- Experience <= 39.50 | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | |--- Experience > 39.50 | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | |--- Age_2 > 0.50 | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | |--- Education_3 > 0.50 | | | | |--- weights: [0.15, 0.00] class: 0 |--- Income_3 > 0.50 | |--- Education_3 <= 0.50 | | |--- Education_2 <= 0.50 | | | |--- Family_4 <= 0.50 | | | | |--- Family_3 <= 0.50 | | | | | |--- CD_Account_1 <= 0.50 | | | | | | |--- Experience <= 8.50 | | | | | | | |--- Experience <= 7.50 | | | | | | | | |--- weights: [12.75, 0.00] class: 0 | | | | | | | |--- Experience > 7.50 | | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | | |--- weights: [0.90, 0.00] class: 0 | | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | | |--- Family_2 <= 0.50 | | | | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | | | | |--- Family_2 > 0.50 | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | |--- Experience > 8.50 | | | | | | | |--- Family_2 <= 0.50 | | | | | | | | |--- weights: [25.95, 0.00] class: 0 | | | | | | | |--- Family_2 > 0.50 | | | | | | | | |--- weights: [25.05, 0.00] class: 0 | | | | | |--- CD_Account_1 > 0.50 | | | | | | |--- Family_2 <= 0.50 | | | | | | | |--- CCAvg_3.0 <= 0.50 | | | | | | | | |--- weights: [0.90, 0.00] class: 0 | | | | | | | |--- CCAvg_3.0 > 0.50 | | | | | | | | |--- 
Securities_Account_1 <= 0.50 | | | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | | | |--- Securities_Account_1 > 0.50 | | | | | | | | | |--- Mortgage_1 <= 0.50 | | | | | | | | | | |--- Experience <= 28.50 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | | | |--- Experience > 28.50 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | | |--- Mortgage_1 > 0.50 | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Family_2 > 0.50 | | | | | | | |--- Age_3 <= 0.50 | | | | | | | | |--- weights: [1.35, 0.00] class: 0 | | | | | | | |--- Age_3 > 0.50 | | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | |--- Family_3 > 0.50 | | | | | |--- Experience <= 32.50 | | | | | | |--- CCAvg_3.0 <= 0.50 | | | | | | | |--- Experience <= 8.50 | | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | | | |--- Experience > 8.50 | | | | | | | | |--- Experience <= 19.50 | | | | | | | | | |--- CCAvg_2.0 <= 0.50 | | | | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | | | | | |--- CCAvg_2.0 > 0.50 | | | | | | | | | | |--- Experience <= 18.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- Experience > 18.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | |--- Experience > 19.50 | | | | | | | | | |--- weights: [0.00, 6.80] class: 1 | | | | | | |--- CCAvg_3.0 > 0.50 | | | | | | | |--- CD_Account_1 <= 0.50 | | | | | | | | |--- weights: [0.00, 10.20] class: 1 | | | | | | | |--- CD_Account_1 > 0.50 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | |--- Experience > 32.50 | | | | | | |--- Experience <= 35.50 | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | | |--- Experience > 35.50 | | | | | | | |--- Experience <= 36.50 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- Experience > 36.50 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | |--- Family_4 > 0.50 | | | | |--- weights: [0.00, 20.40] 
class: 1 | | |--- Education_2 > 0.50 | | | |--- CCAvg_3.0 <= 0.50 | | | | |--- Experience <= 40.50 | | | | | |--- Family_3 <= 0.50 | | | | | | |--- Experience <= 26.50 | | | | | | | |--- Experience <= 25.50 | | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | | |--- Experience <= 12.00 | | | | | | | | | | |--- Family_4 <= 0.50 | | | | | | | | | | | |--- weights: [0.00, 5.95] class: 1 | | | | | | | | | | |--- Family_4 > 0.50 | | | | | | | | | | | |--- truncated branch of depth 5 | | | | | | | | | |--- Experience > 12.00 | | | | | | | | | | |--- Experience <= 18.50 | | | | | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | | | | | | |--- Experience > 18.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | | |--- Experience <= 10.00 | | | | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | | | | | |--- Experience > 10.00 | | | | | | | | | | |--- CD_Account_1 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | | |--- CD_Account_1 > 0.50 | | | | | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | | | |--- Experience > 25.50 | | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | |--- Experience > 26.50 | | | | | | | |--- Experience <= 37.00 | | | | | | | | |--- weights: [0.00, 11.05] class: 1 | | | | | | | |--- Experience > 37.00 | | | | | | | | |--- Experience <= 39.00 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- Experience > 39.00 | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | |--- Family_3 > 0.50 | | | | | | |--- Mortgage_1 <= 0.50 | | | | | | | |--- CD_Account_1 <= 0.50 | | | | | | | | |--- Online_1 <= 0.50 | | | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | | | |--- Experience <= 4.50 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | | | |--- Experience > 4.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | |--- CreditCard_1 > 0.50 | | 
| | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | | | |--- Online_1 > 0.50 | | | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | | | |--- CD_Account_1 > 0.50 | | | | | | | | |--- Experience <= 24.00 | | | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | | | |--- weights: [0.00, 2.55] class: 1 | | | | | | | | |--- Experience > 24.00 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Mortgage_1 > 0.50 | | | | | | | |--- Age_2 <= 0.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- Age_2 > 0.50 | | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | |--- Experience > 40.50 | | | | | |--- weights: [0.45, 0.00] class: 0 | | | |--- CCAvg_3.0 > 0.50 | | | | |--- Experience <= 35.50 | | | | | |--- weights: [0.00, 61.20] class: 1 | | | | |--- Experience > 35.50 | | | | | |--- Experience <= 36.50 | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | |--- Experience > 36.50 | | | | | | |--- Online_1 <= 0.50 | | | | | | | |--- Mortgage_1 <= 0.50 | | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | | |--- Mortgage_1 > 0.50 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | |--- Online_1 > 0.50 | | | | | | | |--- weights: [0.00, 4.25] class: 1 | |--- Education_3 > 0.50 | | |--- Experience <= 40.00 | | | |--- CCAvg_3.0 <= 0.50 | | | | |--- Experience <= 1.50 | | | | | |--- weights: [0.90, 0.00] class: 0 | | | | |--- Experience > 1.50 | | | | | |--- Experience <= 35.50 | | | | | | |--- Experience <= 2.50 | | | | | | | |--- Family_4 <= 0.50 | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- Family_4 > 0.50 | | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | | |--- Experience > 2.50 | | | | | | | |--- Experience <= 16.50 | | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | | |--- Experience <= 13.50 | | | | | | | | | | |--- Family_2 <= 0.50 | | | | 
| | | | | | | |--- truncated branch of depth 5 | | | | | | | | | | |--- Family_2 > 0.50 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | |--- Experience > 13.50 | | | | | | | | | | |--- Family_2 <= 0.50 | | | | | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | | | | | | |--- Family_2 > 0.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | | |--- CD_Account_1 <= 0.50 | | | | | | | | | | |--- Experience <= 3.50 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | | | |--- Experience > 3.50 | | | | | | | | | | | |--- weights: [1.05, 0.00] class: 0 | | | | | | | | | |--- CD_Account_1 > 0.50 | | | | | | | | | | |--- weights: [0.00, 3.40] class: 1 | | | | | | | |--- Experience > 16.50 | | | | | | | | |--- CCAvg_2.0 <= 0.50 | | | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | | | |--- Experience <= 30.50 | | | | | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | | | | | | |--- Experience > 30.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- CCAvg_2.0 > 0.50 | | | | | | | | | |--- Mortgage_1 <= 0.50 | | | | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | | | | |--- weights: [0.00, 7.65] class: 1 | | | | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | | | | |--- weights: [0.00, 4.25] class: 1 | | | | | | | | | |--- Mortgage_1 > 0.50 | | | | | | | | | | |--- Family_3 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | | |--- Family_3 > 0.50 | | | | | | | | | | | |--- weights: [0.00, 2.55] class: 1 | | | | | |--- Experience > 35.50 | | | | | | |--- Family_4 <= 0.50 | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | |--- Experience <= 38.00 | | | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | | | | |--- Experience > 38.00 | | | | | | | | | |--- Family_3 <= 0.50 | 
| | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | |--- Family_3 > 0.50 | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Family_4 > 0.50 | | | | | | | |--- weights: [0.75, 0.00] class: 0 | | | |--- CCAvg_3.0 > 0.50 | | | | |--- Family_2 <= 0.50 | | | | | |--- weights: [0.00, 56.95] class: 1 | | | | |--- Family_2 > 0.50 | | | | | |--- Experience <= 24.50 | | | | | | |--- Experience <= 22.50 | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | |--- Experience <= 10.50 | | | | | | | | | |--- Mortgage_1 <= 0.50 | | | | | | | | | | |--- CD_Account_1 <= 0.50 | | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | | |--- CD_Account_1 > 0.50 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | | |--- Mortgage_1 > 0.50 | | | | | | | | | | |--- Experience <= 4.50 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | | | |--- Experience > 4.50 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | |--- Experience > 10.50 | | | | | | | | | |--- weights: [0.00, 2.55] class: 1 | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | |--- weights: [0.00, 5.10] class: 1 | | | | | | |--- Experience > 22.50 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | |--- Experience > 24.50 | | | | | | |--- weights: [0.00, 11.05] class: 1 | | |--- Experience > 40.00 | | | |--- CCAvg_2.0 <= 0.50 | | | | |--- weights: [0.30, 0.00] class: 0 | | | |--- CCAvg_2.0 > 0.50 | | | | |--- weights: [0.15, 0.00] class: 0
# importance of features in the tree building ( The importance of a feature is computed as the
# (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance )
# sorted descending so the most influential features appear first
print(
pd.DataFrame(
model.feature_importances_, columns=["Imp"], index=X_train.columns
).sort_values(by="Imp", ascending=False)
)
Imp Income_3 0.539183 Education_2 0.094826 Family_3 0.088294 Family_4 0.061003 CCAvg_3.0 0.058491 Experience 0.058265 Education_3 0.041124 CD_Account_1 0.019050 Online_1 0.008827 Mortgage_1 0.008468 Family_2 0.006638 CreditCard_1 0.005026 Securities_Account_1 0.003436 Age_2 0.002841 CCAvg_2.0 0.002761 Income_2 0.001054 Age_3 0.000713
# horizontal bar chart of feature importances, ordered ascending so the
# most important feature ends up at the top of the chart
importances = model.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# Choose the type of classifier (same class weights as the untuned tree).
estimator = DecisionTreeClassifier(random_state=1, class_weight={0: 0.15, 1: 0.85})
# Grid of parameters to choose from
parameters = {
"max_depth": [5, 10, 15, None],
"criterion": ["entropy", "gini"],
"splitter": ["best", "random"],
"min_impurity_decrease": [0.00001, 0.0001, 0.01],
}
# Type of scoring used to compare parameter combinations:
# recall, since missing a potential loan taker is the costlier error here
scorer = make_scorer(recall_score)
# Run the grid search with 5-fold cross-validation
grid_obj = GridSearchCV(estimator, parameters, scoring=scorer, cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
# Set the clf to the best combination of parameters
estimator = grid_obj.best_estimator_
# Fit the best algorithm to the full training data.
estimator.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.15, 1: 0.85}, max_depth=5,
min_impurity_decrease=0.01, random_state=1)
# Checking tuned-tree performance on the training set
confusion_matrix_sklearn(estimator, X_train, y_train)
decision_tree_tune_perf_train = get_recall_score(estimator, X_train, y_train)
print("Recall Score:", decision_tree_tune_perf_train)
Recall Score: 0.9822485207100592
# Test-set performance of the tuned tree.
decision_tree_tune_perf_test = get_recall_score(estimator, X_test, y_test)
confusion_matrix_sklearn(estimator, X_test, y_test)
print("Recall Score:", decision_tree_tune_perf_test)
Recall Score: 0.9788732394366197
# Visualize the tuned tree; darken the branch arrows for readability.
plt.figure(figsize=(15, 10))
annotations = tree.plot_tree(
    estimator,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
for annotation in annotations:
    if annotation.arrow_patch is not None:
        annotation.arrow_patch.set_edgecolor("black")
        annotation.arrow_patch.set_linewidth(1)
plt.show()
# Plain-text dump of the tuned tree's decision rules, including the
# (class-weighted) sample weights at each leaf.
rules = tree.export_text(estimator, feature_names=feature_names, show_weights=True)
print(rules)
|--- Income_3 <= 0.50 | |--- CCAvg_3.0 <= 0.50 | | |--- weights: [377.85, 2.55] class: 0 | |--- CCAvg_3.0 > 0.50 | | |--- weights: [15.00, 18.70] class: 1 |--- Income_3 > 0.50 | |--- Education_3 <= 0.50 | | |--- Education_2 <= 0.50 | | | |--- Family_4 <= 0.50 | | | | |--- Family_3 <= 0.50 | | | | | |--- weights: [68.25, 2.55] class: 0 | | | | |--- Family_3 > 0.50 | | | | | |--- weights: [1.95, 22.95] class: 1 | | | |--- Family_4 > 0.50 | | | | |--- weights: [0.00, 20.40] class: 1 | | |--- Education_2 > 0.50 | | | |--- weights: [5.40, 100.30] class: 1 | |--- Education_3 > 0.50 | | |--- weights: [5.85, 119.85] class: 1
Using the above extracted decision rules we can make interpretations from the decision tree model like:
If a customer's income is less than or equal to 500K and CCAvg is less than 3K, then the customer is unlikely to take a personal loan (the tree predicts class 0 for this branch)
If a customer's income is more than 500K, education is of type 1, and family size is 4, then the customer is a strong candidate for a personal loan — the tree predicts class 1 with all leaf weight on the loan-taking class
Interpretations from other decision rules can be made similarly
# Feature importances of the tuned tree: the (normalized) total reduction
# of the criterion contributed by each feature (Gini importance).
# Compared with the un-tuned model, the importance is now concentrated in
# fewer features.
tuned_importance = pd.DataFrame(
    estimator.feature_importances_, columns=["Imp"], index=X_train.columns
)
print(tuned_importance.sort_values(by="Imp", ascending=False))
Imp Income_3 0.629640 Education_2 0.110735 Family_3 0.094377 CCAvg_3.0 0.060762 Family_4 0.059097 Education_3 0.045389 Online_1 0.000000 Family_2 0.000000 CreditCard_1 0.000000 Experience 0.000000 CD_Account_1 0.000000 Income_2 0.000000 Mortgage_1 0.000000 CCAvg_2.0 0.000000 Age_3 0.000000 Age_2 0.000000 Securities_Account_1 0.000000
# Bar chart of the tuned tree's feature importances (ascending order).
importances = estimator.feature_importances_
indices = np.argsort(importances)
positions = np.arange(len(indices))
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(positions, importances[indices], color="violet", align="center")
plt.yticks(positions, [feature_names[idx] for idx in indices])
plt.xlabel("Relative Importance")
plt.show()
The DecisionTreeClassifier provides parameters such as min_samples_leaf and max_depth to prevent a tree from overfiting. Cost complexity pruning provides another option to control the size of a tree. In DecisionTreeClassifier, this pruning technique is parameterized by the cost complexity parameter, ccp_alpha. Greater values of ccp_alpha increase the number of nodes pruned. Here we only show the effect of ccp_alpha on regularizing the trees and how to choose a ccp_alpha based on validation scores.
Minimal cost complexity pruning recursively finds the node with the "weakest link". The weakest link is characterized by an effective alpha, where the nodes with the smallest effective alpha are pruned first. To get an idea of what values of ccp_alpha could be appropriate, scikit-learn provides DecisionTreeClassifier.cost_complexity_pruning_path that returns the effective alphas and the corresponding total leaf impurities at each step of the pruning process. As alpha increases, more of the tree is pruned, which increases the total impurity of its leaves
# Compute the minimal cost-complexity pruning path: the sequence of
# effective alphas and the total leaf impurity after pruning at each one.
clf = DecisionTreeClassifier(random_state=1, class_weight={0: 0.15, 1: 0.85})
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities
pd.DataFrame(path)
| ccp_alphas | impurities | |
|---|---|---|
| 0 | 0.000000e+00 | 0.000335 |
| 1 | 1.311976e-19 | 0.000335 |
| 2 | 7.434529e-19 | 0.000335 |
| 3 | 7.434529e-19 | 0.000335 |
| 4 | 7.434529e-19 | 0.000335 |
| ... | ... | ... |
| 75 | 5.283568e-03 | 0.067788 |
| 76 | 2.443063e-02 | 0.092219 |
| 77 | 3.085352e-02 | 0.153926 |
| 78 | 3.138616e-02 | 0.216698 |
| 79 | 2.531581e-01 | 0.469856 |
80 rows × 2 columns
# Total leaf impurity vs effective alpha. The last alpha (which prunes
# the tree to a single node) is dropped so it doesn't dominate the axis.
fig, axis = plt.subplots(figsize=(10, 5))
axis.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
axis.set_xlabel("effective alpha")
axis.set_ylabel("total impurity of leaves")
axis.set_title("Total Impurity vs effective alpha for training set")
plt.show()
Training decision tree using the effective alphas. The last value in ccp_alphas is the alpha value that prunes the whole tree, leaving the tree, clfs[-1], with one node
# Fit one tree per effective alpha; larger alphas prune more aggressively.
clfs = []
for alpha in ccp_alphas:
    clf = DecisionTreeClassifier(
        random_state=1, ccp_alpha=alpha, class_weight={0: 0.15, 1: 0.85}
    )
    clf.fit(X_train, y_train)
    clfs.append(clf)

# The final alpha prunes everything: the last tree is a single root node.
print(
    "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]
    )
)
Number of nodes in the last tree is: 1 with ccp_alpha: 0.2531581278404188
Remove the last element in clfs and ccp_alphas, because it is the trivial tree with only one node. The number of nodes and tree depth decreases as alpha increases.
# Discard the trivial single-node tree (last element) before analysis.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]

# Node count and depth shrink as alpha grows.
node_counts = [pruned.tree_.node_count for pruned in clfs]
depth = [pruned.tree_.max_depth for pruned in clfs]

fig, axes = plt.subplots(2, 1, figsize=(10, 7))
axes[0].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
axes[0].set_xlabel("alpha")
axes[0].set_ylabel("number of nodes")
axes[0].set_title("Number of nodes vs alpha")
axes[1].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post")
axes[1].set_xlabel("alpha")
axes[1].set_ylabel("depth of tree")
axes[1].set_title("Depth vs alpha")
fig.tight_layout()
# Recall of every pruned tree on the train and test sets.
recall_train = [recall_score(y_train, pruned.predict(X_train)) for pruned in clfs]
recall_test = [recall_score(y_test, pruned.predict(X_test)) for pruned in clfs]

# Accuracy per tree as well (kept for reference; not plotted below).
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]

# Train vs test recall as alpha increases.
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("alpha")
ax.set_ylabel("Recall")
ax.set_title("Recall vs alpha for training and testing sets")
ax.plot(
    ccp_alphas, recall_train, marker="o", label="train", drawstyle="steps-post",
)
ax.plot(ccp_alphas, recall_test, marker="o", label="test", drawstyle="steps-post")
ax.legend()
plt.show()
The maximum test recall occurs near alpha = 0.030; rather than that, I would choose alpha = 0.005, which retains more of the tree's information while still giving a high recall.
# Select the pruned tree that maximizes recall on the TEST set
# (note: the argmax is over test recall only).
index_best_model = int(np.argmax(recall_test))
best_model = clfs[index_best_model]
print(best_model)
DecisionTreeClassifier(ccp_alpha=0.005283567915541212,
class_weight={0: 0.15, 1: 0.85}, random_state=1)
best_model.fit(X_train, y_train)
DecisionTreeClassifier(ccp_alpha=0.005283567915541212,
class_weight={0: 0.15, 1: 0.85}, random_state=1)
# Train-set confusion matrix and recall for the selected pruned tree.
train_recall_best = get_recall_score(best_model, X_train, y_train)
confusion_matrix_sklearn(best_model, X_train, y_train)
print("Recall Score:", train_recall_best)
Recall Score: 0.9822485207100592
# Test-set confusion matrix and recall for the selected pruned tree.
test_recall_best = get_recall_score(best_model, X_test, y_test)
confusion_matrix_sklearn(best_model, X_test, y_test)
print("Recall Score:", test_recall_best)
Recall Score: 0.9788732394366197
# Visualize the selected pruned tree; darken branch arrows for readability.
plt.figure(figsize=(5, 5))
annotations = tree.plot_tree(
    best_model,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
for annotation in annotations:
    if annotation.arrow_patch is not None:
        annotation.arrow_patch.set_edgecolor("black")
        annotation.arrow_patch.set_linewidth(1)
plt.show()
# Tree at the manually chosen alpha = 0.005: slightly below the argmax
# alpha, trading a little recall for a larger, more informative tree.
best_model2 = DecisionTreeClassifier(
    ccp_alpha=0.005, class_weight={0: 0.15, 1: 0.85}, random_state=1
).fit(X_train, y_train)
DecisionTreeClassifier(ccp_alpha=0.005, class_weight={0: 0.15, 1: 0.85},
random_state=1)
# Train-set performance of the post-pruned (alpha = 0.005) tree.
decision_tree_postpruned_perf_train = get_recall_score(best_model2, X_train, y_train)
confusion_matrix_sklearn(best_model2, X_train, y_train)
print("Recall Score:", decision_tree_postpruned_perf_train)
Recall Score: 0.9467455621301775
# Test-set performance of the post-pruned (alpha = 0.005) tree.
decision_tree_postpruned_perf_test = get_recall_score(best_model2, X_test, y_test)
confusion_matrix_sklearn(best_model2, X_test, y_test)
print("Recall Score:", decision_tree_postpruned_perf_test)
Recall Score: 0.8943661971830986
# Visualize the post-pruned tree; darken branch arrows for readability.
plt.figure(figsize=(15, 10))
annotations = tree.plot_tree(
    best_model2,
    feature_names=feature_names,
    filled=True,
    fontsize=9,
    node_ids=False,
    class_names=None,
)
for annotation in annotations:
    if annotation.arrow_patch is not None:
        annotation.arrow_patch.set_edgecolor("black")
        annotation.arrow_patch.set_linewidth(1)
plt.show()
# Plain-text dump of the post-pruned tree's decision rules, including the
# (class-weighted) sample weights at each leaf.
rules2 = tree.export_text(best_model2, feature_names=feature_names, show_weights=True)
print(rules2)
|--- Income_3 <= 0.50 | |--- CCAvg_3.0 <= 0.50 | | |--- weights: [377.85, 2.55] class: 0 | |--- CCAvg_3.0 > 0.50 | | |--- CD_Account_1 <= 0.50 | | | |--- weights: [14.70, 10.20] class: 0 | | |--- CD_Account_1 > 0.50 | | | |--- weights: [0.30, 8.50] class: 1 |--- Income_3 > 0.50 | |--- Education_3 <= 0.50 | | |--- Education_2 <= 0.50 | | | |--- Family_4 <= 0.50 | | | | |--- Family_3 <= 0.50 | | | | | |--- weights: [68.25, 2.55] class: 0 | | | | |--- Family_3 > 0.50 | | | | | |--- weights: [1.95, 22.95] class: 1 | | | |--- Family_4 > 0.50 | | | | |--- weights: [0.00, 20.40] class: 1 | | |--- Education_2 > 0.50 | | | |--- weights: [5.40, 100.30] class: 1 | |--- Education_3 > 0.50 | | |--- weights: [5.85, 119.85] class: 1
# Feature importances of the post-pruned tree: the (normalized) total
# reduction of the criterion per feature (Gini importance).
postpruned_importance = pd.DataFrame(
    best_model2.feature_importances_, columns=["Imp"], index=X_train.columns
)
print(postpruned_importance.sort_values(by="Imp", ascending=False))
Imp Income_3 0.621473 Education_2 0.109298 Family_3 0.093153 CCAvg_3.0 0.059974 Family_4 0.058330 Education_3 0.044800 CD_Account_1 0.012971 Online_1 0.000000 Family_2 0.000000 CreditCard_1 0.000000 Experience 0.000000 Income_2 0.000000 Mortgage_1 0.000000 CCAvg_2.0 0.000000 Age_3 0.000000 Age_2 0.000000 Securities_Account_1 0.000000
# Bar chart of the post-pruned tree's feature importances (ascending order).
importances = best_model2.feature_importances_
indices = np.argsort(importances)
positions = np.arange(len(indices))
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(positions, importances[indices], color="violet", align="center")
plt.yticks(positions, [feature_names[idx] for idx in indices])
plt.xlabel("Relative Importance")
plt.show()
# Summary of train/test recall for the four models built in this notebook.
# Values are the recall scores printed earlier, rounded to 2 decimals.
# Fixes vs the original cell: the "treee" typo in a displayed label, and
# two rows that disagreed with the printed scores — tuned tree was
# 0.9822/0.9789 (→ 0.98/0.98, not 0.97/0.97) and the post-pruned tree was
# 0.9467/0.8944 (→ 0.95/0.89, not 0.95/0.91).
comparison_frame = pd.DataFrame(
    {
        "Model": [
            "Initial decision tree model",
            "Decision tree with restricted maximum depth",
            "Decision tree with hyperparameter tuning",
            "Decision tree with post-pruning",
        ],
        "Train_Recall": [1, 0.98, 0.98, 0.95],
        "Test_Recall": [0.78, 0.97, 0.98, 0.89],
    }
)
comparison_frame
| Model | Train_Recall | Test_Recall | |
|---|---|---|---|
| 0 | Initial decision tree model | 1.00 | 0.78 |
| 1 | Decision tree with restricted maximum depth | 0.98 | 0.97 |
| 2 | Decision treee with hyperparameter tuning | 0.97 | 0.97 |
| 3 | Decision tree with post-pruning | 0.95 | 0.91 |